{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pandas\n",
    "import os.path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def getData(fromCache=True):\n",
    "    # Retrieved 13-06-2016 from http://factfinder.census.gov/faces/nav/jsf/pages/searchresults.xhtml?refresh=t\n",
    "    dataDirectory = '../../data/census/ACS_2013/'\n",
    "    outputName = 'acs2013_clean.csv'\n",
    "    \n",
    "    if fromCache & os.path.isfile(dataDirectory + outputName):\n",
    "        return pandas.read_csv(dataDirectory + outputName, header=0)\n",
    "    \n",
    "    else:\n",
    "        # Retrieved 10-07-2016 from http://factfinder.census.gov/bkmk/table/1.0/en/ACS/13_5YR/B15003/0500000US04007.15000|0500000US04013.15000|0500000US04021.15000|0500000US04025.15000\n",
    "        education = pandas.read_csv(dataDirectory + 'educational_attainment/ACS_13_5YR_B15003_with_ann.csv', \n",
    "                                    header=1)\n",
    "        education['highschool'] = (education['Estimate; Total: - Regular high school diploma'] + \n",
    "                                   education['Estimate; Total: - GED or alternative credential']\n",
    "                                  )\n",
    "        education['undergraduate'] = (education['Estimate; Total: - Associate\\'s degree'] +\n",
    "                                      education['Estimate; Total: - Bachelor\\'s degree']\n",
    "                                     )\n",
    "        education['postgraduate'] = (education['Estimate; Total: - Master\\'s degree'] +\n",
    "                                     education['Estimate; Total: - Professional school degree'] +\n",
    "                                     education['Estimate; Total: - Doctorate degree']\n",
    "                                    )\n",
    "        # Retrieved 10-07-2016 from http://factfinder.census.gov/bkmk/table/1.0/en/ACS/13_5YR/B25077/0500000US04007.15000|0500000US04013.15000|0500000US04021.15000|0500000US04025.15000\n",
    "        homeValue = pandas.read_csv(dataDirectory + 'median_home_value/ACS_13_5YR_B25077_with_ann.csv',  \n",
    "                                    header=1)\n",
    "        # Retrieved 10-07-2016 from http://factfinder.census.gov/bkmk/table/1.0/en/ACS/13_5YR/B19013/0500000US04007.15000|0500000US04013.15000|0500000US04021.15000|0500000US04025.15000\n",
    "        income = pandas.read_csv(dataDirectory + 'median_household_income/ACS_13_5YR_B19013_with_ann.csv',  \n",
    "                                 header=1)\n",
    "\n",
    "        selectedData = (pandas.DataFrame(homeValue[['Id2', 'Estimate; Median value (dollars)']])\n",
    "                        .merge(education[['Id2',\n",
    "                                          'highschool',\n",
    "                                          'undergraduate',\n",
    "                                          'postgraduate'\n",
    "                                        ]],\n",
    "                               on='Id2', how='outer'\n",
    "                             )\n",
    "                        .merge(income[['Id2', \n",
    "                                       'Estimate; Median household income in the past 12 months (in 2013 inflation-adjusted dollars)'\n",
    "                                      ]],\n",
    "                               on='Id2', how='outer'                               \n",
    "                              )\n",
    "                        .rename(columns={'Id2': 'GEOID',\n",
    "                                         'Estimate; Median value (dollars)': 'median_home_value',\n",
    "                                         'highschool': 'education_highschool',\n",
    "                                         'undergraduate': 'education_undergraduate',\n",
    "                                         'postgraduate': 'education_postgraduate',\n",
    "                                         'Estimate; Median household income in the past 12 months (in 2013 inflation-adjusted dollars)': \n",
    "                                         'median_household_income'\n",
    "                                        }\n",
    "                               )\n",
    "                        )\n",
    "        \n",
    "        # remove character artifacts from income data        \n",
    "        selectedData['median_home_value'] = pandas.to_numeric(\n",
    "            selectedData['median_home_value']\n",
    "            .str.strip('><+-')\n",
    "            .str.replace(',', '')\n",
    "        )\n",
    "        \n",
    "        selectedData['median_household_income'] = pandas.to_numeric(\n",
    "            selectedData['median_household_income']\n",
    "            .str.strip('><+-')\n",
    "            .str.replace(',', '')\n",
    "        )\n",
    "        \n",
    "        selectedData.fillna(selectedData.mean(), inplace=True)\n",
    "    \n",
    "        selectedData.to_csv(dataDirectory + outputName, index=False)\n",
    "        return selectedData"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
